library(dplyr)
# Read in the data
# stu_data <- read.csv("data/sim_assessment_data.csv", stringsAsFactors = FALSE)
# load() injects each .rda file's saved objects into the global environment
# under their original names (sch_score, agency, attend, expel, full_enroll,
# staff -- see the ls() output below).
load("data/cohort_test_scores.rda")
# NOTE(review): `addrownums` is a head.matrix() argument; for a data.frame it
# is silently absorbed by `...` -- confirm sch_score's class, or drop the arg.
head(sch_score, addrownums = FALSE)
load("data/agency.rda")
load("data/attend.rda")
load("data/expel.rda")
load("data/enrollment.rda")
load("data/staff.rda")
# List what the loads placed in the workspace (console output below)
ls()
[1] "agency" "attend" "expel" "full_enroll" "sch_score"
[6] "staff"
# Assemble the analysis file: start from the school score table and keep only
# rows that match in every other table (inner joins throughout). The agency
# join keys on district/school/test year; the remaining tables also key on
# school_year. The attendance-side `enrollment` column is dropped just before
# joining full_enroll so that table's version is the one kept (and no .x/.y
# suffixes appear).
full_data <- sch_score %>%
  inner_join(agency, by = c("distid", "schid", "test_year")) %>%
  inner_join(attend, by = c("distid", "schid", "test_year", "school_year")) %>%
  inner_join(expel, by = c("distid", "schid", "test_year", "school_year")) %>%
  dplyr::select(-enrollment) %>%
  inner_join(full_enroll,
             by = c("distid", "schid", "test_year", "school_year")) %>%
  inner_join(staff, by = c("distid", "schid", "test_year", "school_year"))
# Inspect the structure of the merged file (console output follows)
str(full_data)
'data.frame': 15087 obs. of 69 variables:
$ distid : num 275 222 288 287 174 290 148 88 174 174 ...
$ schid : int 9 1 11 10 120 3 2 1 129 64 ...
$ test_year : num 2007 2010 2007 2011 2011 ...
$ grade : num 7 4 4 4 4 7 5 8 4 5 ...
$ subject : chr "read" "read" "math" "math" ...
$ n1 : num 195 107 39 12 42 89 72 111 10 20 ...
$ ss1 : num 1213 1115 966 999 931 ...
$ n2 : num 199 109 41 12 33 75 76 108 21 36 ...
$ ss2 : num 1233 1166 1063 1071 1015 ...
$ school_year : chr "2006-07" "2009-10" "2006-07" "2010-11" ...
$ cesa : int 1 2 9 4 1 3 10 1 1 1 ...
$ county : int 40 13 37 32 40 25 9 40 40 40 ...
$ athl_conf : int 47 158 46 28 27 42 4 47 27 27 ...
$ locale : int 3 4 2 2 1 4 4 3 1 1 ...
$ locale_desc : chr "Urban Fringe of a Large City" "Urban Fringe of a Mid-Size City" "Mid-Size City" "Mid-Size City" ...
$ agency_type : chr "Public school" "Public school" "Public school" "Public school" ...
$ grade_group : chr "Middle School" "Elementary School" "Elementary School" "Elementary School" ...
$ low_grade : chr "06" "03" "KG" "K4" ...
$ high_grade : chr "08" "05" "05" "08" ...
$ school_size : chr "Large" "Medium" "Medium" "Small" ...
$ charter_ind : chr "No" "No" "No" "Yes" ...
$ title1a_program : chr "TASNF" "TAS" "TAS" "SWP" ...
$ title1a_program_desc : chr "Targeted Assistance eligible but not funded" "Targeted Assistance eligible and funded" "Targeted Assistance eligible and funded" "School-wide Program" ...
$ days_possible : num 107260 76496 53705 21606 72782 ...
$ days_attended : num 103245 73609 51537 20568 67523 ...
$ att_rate : num 96.3 96.2 96 95.2 92.8 ...
$ total_enroll : int 614 445 310 133 418 268 505 287 349 353 ...
$ suspension_rate : num 10.9 NA 2.9 NA 14.3 ...
$ suspension_count : num 67 0 9 0 60 0 2 4 204 102 ...
$ susp_rate_female : num 4.12 NA 0.6 NA 10.81 ...
$ susp_rate_male : num 17.03 NA 5.59 NA 17.17 ...
$ susp_count_female : num 12 0 1 0 20 0 1 1 70 25 ...
$ susp_count_male : num 55 0 8 0 40 0 1 3 134 77 ...
$ amer_ind_enroll : int 3 2 1 1 1 1 6 5 2 2 ...
$ asian_enroll : int 42 13 52 9 132 2 7 18 2 16 ...
$ black_enroll : int 104 13 15 0 232 5 14 59 336 284 ...
$ hispanic_enroll : int 28 16 9 5 24 4 8 9 0 11 ...
$ mutli_enroll : int 0 0 0 11 0 0 0 0 0 0 ...
$ pac_island_enroll : int 0 0 0 0 0 0 0 0 0 0 ...
$ white_enroll : int 437 401 233 107 29 256 470 196 9 40 ...
$ amer_ind_susp : num NA NA NA NA NA NA 0 NA NA NA ...
$ asian_susp : num 1 NA 1 0 1 NA 0 0 NA 0 ...
$ black_susp : num 25 NA 3 0 55 NA 0 4 198 92 ...
$ hispanic_susp : num NA 0 NA NA NA NA 0 NA 0 NA ...
$ multi_susp : num 0 0 0 0 0 0 0 0 0 0 ...
$ pac_island_susp : num 0 0 0 0 0 0 0 0 0 0 ...
$ white_susp : num 35 0 4 0 3 0 2 0 NA 7 ...
$ enrollment : num 612 445 311 133 418 268 505 287 349 353 ...
$ frl_per : num 17.6 16.6 45.7 36.1 82.1 ...
$ non_frl_per : num 82.3 83.4 54.3 63.9 17.9 ...
$ frl_count : num 108 74 142 48 343 71 282 29 328 296 ...
$ non_frl_count : num 504 371 169 85 75 197 223 258 21 57 ...
$ swd_per : num 9.64 14.38 7.72 4.51 14.11 ...
$ non_swd_per : num 90.4 85.6 92.3 95.5 85.9 ...
$ swd_count : num 59 64 24 6 59 49 79 34 57 69 ...
$ non_swd_count : num 553 381 287 127 359 219 426 253 292 284 ...
$ ell_per : num 5.23 2.7 19.94 NA 20.81 ...
$ non_ell_per : num 94.8 97.3 80.1 100 79.2 ...
$ ell_count : num 32 12 62 NA 87 2 2 26 NA 2 ...
$ non_ell_count : num 580 433 249 133 331 266 503 261 349 351 ...
$ total_enrollment : int 614 445 310 133 418 268 505 287 349 353 ...
$ admin_fte : num 2 1 0.8 0.33 1 1 1 0.7 2 1.5 ...
$ ratio_stu_to_admin : num 307 445 388 403 418 ...
$ support_staff_fte : num 14.51 13.93 15.78 0.79 9.95 ...
$ ratio_stu_to_supstaff : num 42.3 31.9 19.6 168.3 42 ...
$ licensed_fte : num 47.9 38.71 28.67 9.43 30.48 ...
$ ratio_stu_to_licensed : num 12.8 11.5 10.8 14.1 13.7 ...
$ total_staff : num 64.4 53.6 45.2 10.6 41.4 ...
$ ratio_students_to_allstaff: num 9.53 8.3 6.85 12.61 10.09 ...
# Drop the component tables now that full_data holds the merged result
rm(agency, attend, sch_score, expel, staff, full_enroll)
# How many distinct year/grade/subject cells are there? (50 -- output below)
full_data %>% dplyr::select(test_year, grade, subject) %>%
distinct %>% nrow
[1] 50
# Distribution of rows (schools) per year/grade/subject cell
full_data %>% dplyr::select(test_year, grade, subject) %>%
group_by(test_year, grade, subject) %>%
summarize(count = n()) %>%
pull(count) %>% summary
Min. 1st Qu. Median Mean 3rd Qu. Max.
152.0 220.5 253.0 301.7 425.8 469.0
# Load the packages we need to fit multiple models
# (dplyr is already attached; tidyverse adds purrr/tidyr/ggplot2 etc. and
# masks stats::filter and stats::lag -- see the conflict messages below)
library(tidyverse)
[30m── [1mAttaching packages[22m ──────────────────────────────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mggplot2[30m 2.2.1 [32m✔[30m [34mreadr [30m 1.1.1
[32m✔[30m [34mtibble [30m 1.4.2 [32m✔[30m [34mpurrr [30m 0.2.4
[32m✔[30m [34mtidyr [30m 0.8.0 [32m✔[30m [34mstringr[30m 1.2.0
[32m✔[30m [34mggplot2[30m 2.2.1 [32m✔[30m [34mforcats[30m 0.3.0[39m
package ‘tibble’ was built under R version 3.4.3package ‘tidyr’ was built under R version 3.4.3package ‘forcats’ was built under R version 3.4.3[30m── [1mConflicts[22m ─────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()[39m
library(broom) # to manipulate regression models
package ‘broom’ was built under R version 3.4.4
library(modelr) # to fit multiple models and get their attributes easily
# modelr masks broom::bootstrap (message below); we only use broom::glance,
# add_residuals, and add_predictions here, so the masking is harmless.
Attaching package: ‘modelr’
The following object is masked from ‘package:broom’:
bootstrap
# Build one row per grade x subject x test_year cell, with each cell's rows
# stored in a list-column named `data`. Only the first ten columns (ids,
# keys, and the score summaries) are carried along.
by_group <- full_data %>%
  dplyr::select(1:10) %>%
  group_by(grade, subject, test_year) %>%
  nest()
# Fit the baseline regression of current scale score (ss2) on prior scale
# score (ss1).
#
# model_data: a data frame containing numeric columns `ss1` and `ss2`.
# Returns the fitted `lm` object.
simple_model <- function(model_data) {
  lm(ss2 ~ ss1, data = model_data)
}
# Fit the model once per nested cell: map() walks the `data` list-column and
# calls simple_model on each element (a functional stand-in for a loop),
# storing the fitted lm objects in a new `model` list-column.
by_group$model <- map(by_group$data, simple_model)
# To understand our model(s) - use the `glance` function to compute some
# summary statistics for each model
# (the object is named `glance` too, but broom::glance is called with an
# explicit namespace, so no clash occurs)
glance <- by_group %>%
mutate(glance = map(model, broom::glance)) %>%
# NOTE(review): unnest(..., .drop = TRUE) is the tidyr 0.8 API; newer tidyr
# removed `.drop` -- verify the tidyr version before re-running.
unnest(glance, .drop = TRUE)
# Display the output
# Sort by model fit, best-fitting group first
arrange(glance, desc(r.squared))
# NOTE(review): the rest of this section repeats the nest/define/fit steps
# above verbatim (an expository walkthrough). Re-nesting rebuilds by_group
# and discards the `model` column; the repeated function definitions and
# mutate calls are no-ops relative to the first run.
by_group <- full_data[, 1:10] %>%
group_by(grade, subject, test_year) %>%
nest()
head(by_group)
simple_model <- function(df) {
# df is a user specified parameter to the function
# ss2 and ss1 are variables that must be present in the df provided by the user
lm(ss2 ~ ss1, data = df)
}
head(by_group)
by_group <- by_group %>%
# take each element of the data column, and pass it individually as the
# first argument to the simple_model function
mutate(model = map(data, simple_model))
head(by_group)
simple_model <- function(df) {
# df is a user specified parameter to the function
# ss2 and ss1 are variables that must be present in the df provided by the user
lm(ss2 ~ ss1, data = df)
}
head(by_group)
by_group <- by_group %>%
# take each element of the data column, and pass it individually as the
# first argument to the simple_model function
mutate(model = map(data, simple_model))
head(by_group)
# Extract each group's coefficients into columns and join them back onto
# by_group (the join keys are inferred -- see the "Joining, by" message).
by_group <- inner_join(
by_group, # our original data
by_group %>%
mutate(coefs = map(model, coef)) %>% # get the coefficients out of the model
group_by(grade, subject, test_year) %>% # group the data
do(data.frame(t(unlist(.$coefs)))) # split each coefficient into a new column
)
Joining, by = c("grade", "subject", "test_year")
names(by_group)
[1] "grade" "subject" "test_year" "data" "model"
[6] "X.Intercept." "ss1"
# Replace the 6th elements name
# NOTE(review): positional renames are fragile -- they silently rename the
# wrong columns if the join ever returns columns in a different order.
names(by_group)[6] <- "est_intercept"
# Replace the 7th elements name
names(by_group)[7] <- "est_ss1"
# Draw one regression line per group (slope/intercept from each fitted
# model) on a common score scale; alpha shows where lines pile up.
ggplot(by_group) + scale_x_continuous(limits = c(700, 1600)) +
scale_y_continuous(limits = c(700, 1600)) +
geom_abline(data = by_group,
aes(slope = est_ss1, intercept = est_intercept),
alpha = I(0.5)) + theme_bw()
library(broom) # R package for working with regression models smoothly
# Create a new dataset with augmented values
# Pooled model: current score on prior score plus test year
m1 <- lm(ss2 ~ ss1 + test_year, data = full_data)
sch_score_m1 <- augment(m1) # generate the leverages and more for each row in data
names(sch_score_m1)
[1] "ss2" "ss1" "test_year" ".fitted" ".se.fit" ".resid"
[7] ".hat" ".sigma" ".cooksd" ".std.resid"
# Per-observation influence diagnostics (DFBETAs, DFFITS, Cook's d, hat)
m1_inf <- influence.measures(m1)
head(m1_inf$infmat)
dfb.1_ dfb.ss1 dfb.tst_ dffit cov.r cook.d hat
1 -0.0055337529 -4.665108e-03 0.0055733067 -0.008151389 1.000443 2.214951e-05 0.0002900181
2 -0.0012741071 -7.049256e-05 0.0012756778 0.002160841 1.000292 1.556510e-06 0.0001017740
3 0.0120029041 -1.548576e-02 -0.0118488071 0.021686886 1.000362 1.567718e-04 0.0003979184
4 -0.0051323628 -5.058499e-03 0.0051808896 0.007940750 1.000505 2.101964e-05 0.0003424574
5 -0.0062581027 -9.502183e-03 0.0063484394 0.012028657 1.000687 4.823187e-05 0.0005412427
6 0.0005568328 8.676617e-04 -0.0005642642 0.001287116 1.000376 5.522590e-07 0.0001785945
plotdf <- sch_score_m1
# Column 2 of infmat is the ss1 DFBETA here (dfb.ss1 in the printed header);
# verify the column index if the model formula changes.
plotdf$dfbeta_ss1 <- m1_inf$infmat[, 2]
# DFBETA for ss1 vs prior score, faceted by year; horizontal lines mark the
# conventional +/- 2/sqrt(n) influence threshold.
ggplot(plotdf, aes(x = ss1, y = dfbeta_ss1)) +
geom_point(alpha=I(0.2)) +
geom_hline(yintercept = 2/sqrt(nrow(m1$model))) +
geom_hline(yintercept = -2/sqrt(nrow(m1$model))) +
facet_wrap(~test_year) +
geom_rug(alpha=1/4, position = "jitter") +
theme_bw()
# Same influence workflow, now adjusting for grade instead of test year
m1 <- lm(ss2 ~ ss1 + grade, data = full_data)
sch_score_m1 <- augment(m1) # generate the leverages and more for each row in data
names(sch_score_m1)
[1] "ss2" "ss1" "grade" ".fitted" ".se.fit" ".resid"
[7] ".hat" ".sigma" ".cooksd" ".std.resid"
m1_inf <- influence.measures(m1)
head(m1_inf$infmat)
dfb.1_ dfb.ss1 dfb.grad dffit cov.r cook.d hat
1 0.0029948611 -0.0024907918 -0.0002853156 -0.005628864 1.000313 1.056193e-05 0.0001548384
2 -0.0011996910 0.0016424004 -0.0021515545 0.002486645 1.000458 2.061268e-06 0.0002637846
3 0.0158692584 -0.0133577263 0.0037853084 0.019118899 1.000239 1.218420e-04 0.0002905252
4 0.0037527312 -0.0027937983 -0.0001812284 0.005648069 1.000364 1.063415e-05 0.0001968821
5 0.0100151557 -0.0089359100 0.0038804435 0.010940911 1.000593 3.990312e-05 0.0004466684
6 -0.0006522582 0.0005234677 0.0001473935 0.001365502 1.000342 6.215727e-07 0.0001459713
plotdf <- sch_score_m1
# infmat column 2 = ss1 DFBETA (dfb.ss1 in the header above)
plotdf$dfbeta_ss1 <- m1_inf$infmat[, 2]
# DFBETA for ss1 vs prior score, faceted by grade, +/- 2/sqrt(n) thresholds
ggplot(plotdf, aes(x = ss1, y = dfbeta_ss1)) +
geom_point(alpha=I(0.2)) +
geom_hline(yintercept = 2/sqrt(nrow(m1$model))) +
geom_hline(yintercept = -2/sqrt(nrow(m1$model))) +
facet_wrap(~grade) +
geom_rug(alpha=1/4, position = "jitter") +
theme_bw()
# Same influence workflow, adjusting for locale (urbanicity code)
m1 <- lm(ss2 ~ ss1 + locale, data = full_data)
sch_score_m1 <- augment(m1) # generate the leverages and more for each row in data
names(sch_score_m1)
[1] ".rownames" "ss2" "ss1" "locale" ".fitted" ".se.fit"
[7] ".resid" ".hat" ".sigma" ".cooksd" ".std.resid"
m1_inf <- influence.measures(m1)
head(m1_inf$infmat)
dfb.1_ dfb.ss1 dfb.locl dffit cov.r cook.d hat
1 0.0033510034 -3.709778e-03 0.0018660197 -0.004795582 1.000363 7.666324e-06 1.849560e-04
2 0.0001768862 -8.971627e-05 0.0002034397 0.001504953 1.000266 7.550108e-07 6.907587e-05
3 0.0173100629 -1.590765e-02 -0.0025779229 0.019893416 1.000220 1.319128e-04 2.920650e-04
4 0.0054028866 -4.825047e-03 -0.0014531137 0.006799427 1.000371 1.541155e-05 2.121335e-04
5 0.0105396391 -9.591768e-03 -0.0029284416 0.012182271 1.000559 4.947142e-05 4.260801e-04
6 -0.0011778973 1.258906e-03 -0.0001999615 0.001718139 1.000348 9.840664e-07 1.486204e-04
plotdf <- sch_score_m1
# infmat column 2 = ss1 DFBETA (dfb.ss1 in the header above)
plotdf$dfbeta_ss1 <- m1_inf$infmat[, 2]
# DFBETA for ss1 vs prior score, faceted by locale, +/- 2/sqrt(n) thresholds
ggplot(plotdf, aes(x = ss1, y = dfbeta_ss1)) +
geom_point(alpha=I(0.2)) +
geom_hline(yintercept = 2/sqrt(nrow(m1$model))) +
geom_hline(yintercept = -2/sqrt(nrow(m1$model))) +
facet_wrap(~locale) +
geom_rug(alpha=1/4, position = "jitter") +
theme_bw()
# Same influence workflow, adjusting for school size category
m1 <- lm(ss2 ~ ss1 + school_size, data = full_data)
sch_score_m1 <- augment(m1) # generate the leverages and more for each row in data
names(sch_score_m1)
[1] "ss2" "ss1" "school_size" ".fitted" ".se.fit" ".resid"
[7] ".hat" ".sigma" ".cooksd" ".std.resid"
m1_inf <- influence.measures(m1)
head(m1_inf$infmat)
dfb.1_ dfb.ss1 dfb.sc_M dfb.sc_S dffit cov.r cook.d
1 0.0028501820 -0.0036693401 0.0043644291 2.227578e-03 -0.007648585 1.000437 1.462594e-05
2 -0.0003493112 0.0003517784 0.0016733533 -9.311227e-08 0.002675801 1.000363 1.790089e-06
3 0.0151459689 -0.0152529462 0.0039476843 4.037305e-06 0.019426971 1.000215 9.434997e-05
4 0.0028291398 -0.0028491222 -0.0006159071 7.387195e-03 0.008332229 1.001928 1.735761e-05
5 0.0086038269 -0.0086645965 0.0013901333 2.293433e-06 0.010219651 1.000594 2.611159e-05
6 -0.0018078732 0.0018206424 0.0014869494 -4.819061e-07 0.002571296 1.000482 1.652997e-06
hat
1 0.0002369927
2 0.0001139771
3 0.0002920269
4 0.0016702124
5 0.0003983572
6 0.0002246230
plotdf <- sch_score_m1
# infmat column 2 = ss1 DFBETA (dfb.ss1 in the header above)
plotdf$dfbeta_ss1 <- m1_inf$infmat[, 2]
# DFBETA for ss1 vs prior score, faceted by size, +/- 2/sqrt(n) thresholds
ggplot(plotdf, aes(x = ss1, y = dfbeta_ss1)) +
geom_point(alpha=I(0.2)) +
geom_hline(yintercept = 2/sqrt(nrow(m1$model))) +
geom_hline(yintercept = -2/sqrt(nrow(m1$model))) +
facet_wrap(~school_size) +
geom_rug(alpha=1/4, position = "jitter") +
theme_bw()
# Same influence workflow, adjusting for charter status
m1 <- lm(ss2 ~ ss1 + charter_ind, data = full_data)
sch_score_m1 <- augment(m1) # generate the leverages and more for each row in data
names(sch_score_m1)
[1] "ss2" "ss1" "charter_ind" ".fitted" ".se.fit" ".resid"
[7] ".hat" ".sigma" ".cooksd" ".std.resid"
m1_inf <- influence.measures(m1)
head(m1_inf$infmat)
dfb.1_ dfb.ss1 dfb.ch_Y dffit cov.r cook.d hat
1 0.0039305322 -4.228829e-03 0.0007953044 -0.005644070 1.000316 1.061907e-05 1.575692e-04
2 0.0001577928 -2.658701e-05 -0.0003471285 0.001716615 1.000260 9.823182e-07 6.912986e-05
3 0.0168974875 -1.623667e-02 -0.0017253866 0.018692335 1.000234 1.164659e-04 2.815356e-04
4 0.0034793306 -3.489553e-03 0.0116998413 0.012415386 1.001942 5.138371e-05 1.756917e-03
5 0.0092070389 -8.917964e-03 -0.0007521856 0.009824060 1.000543 3.217233e-05 3.927833e-04
6 -0.0009274549 1.002442e-03 -0.0001998413 0.001375589 1.000344 6.307896e-07 1.473811e-04
plotdf <- sch_score_m1
# infmat column 2 = ss1 DFBETA (dfb.ss1 in the header above)
plotdf$dfbeta_ss1 <- m1_inf$infmat[, 2]
# DFBETA for ss1 vs prior score, by charter status, +/- 2/sqrt(n) thresholds
ggplot(plotdf, aes(x = ss1, y = dfbeta_ss1)) +
geom_point(alpha=I(0.2)) +
geom_hline(yintercept = 2/sqrt(nrow(m1$model))) +
geom_hline(yintercept = -2/sqrt(nrow(m1$model))) +
facet_wrap(~charter_ind) +
geom_rug(alpha=1/4, position = "jitter") +
theme_bw()
# NOTE(review): an unconditional install.packages() re-downloads on every
# run; prefer guarding with requireNamespace("cowplot", quietly = TRUE).
# cowplot is also never used below -- confirm it is actually needed.
install.packages("cowplot")
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/cowplot_0.9.3.tgz'
Content type 'application/x-gzip' length 2412869 bytes (2.3 MB)
==================================================
downloaded 2.3 MB
The downloaded binary packages are in
/var/folders/6w/j5943tln5437dxkgq0cfjcp80000gn/T//RtmpdXYmb2/downloaded_packages
# Attach per-group residuals and predictions: map2 pairs each nested data
# frame with its fitted model and calls modelr::add_residuals / add_predictions
by_group <- by_group %>%
mutate(
resids = map2(data, model, add_residuals),
preds = map2(data, model, add_predictions)
)
by_group
# unnest expands the data out from a column
# Join the two unnested frames back together (keys inferred -- see message)
plotdf <- left_join(unnest(by_group, resids),
unnest(by_group, preds))
Joining, by = c("grade", "subject", "test_year", "est_intercept", "est_ss1", "distid", "schid", "n1", "ss1", "n2", "ss2", "school_year")
# Residual plot
plotdf %>%
ggplot(aes(x = pred, y = resid)) +
geom_point(alpha = I(0.2)) + theme_bw() +
geom_smooth()
# Residual diagnostics for a ladder of candidate models (m2-m7).
#
# Helper: scatter residuals against fitted values with a smoother overlaid.
# `model` must be fit with na.action = na.exclude so predict()/residuals()
# are NA-padded to nrow(data) and align with `data` row-for-row.
plot_residuals <- function(model, data) {
  ggplot(data = data, aes(x = predict(model), y = residuals(model))) +
    geom_point(alpha = I(0.2)) + theme_bw() +
    geom_smooth()
}
# m2: prior score + grade + district.
# Bug fix: na.action = na.exclude was missing here although every other model
# (m3-m7) has it; without it, any rows dropped for NA would make predict()
# and residuals() shorter than full_data and break the assignments below.
m2 <- lm(ss2 ~ ss1 + grade + distid, na.action = na.exclude, data = full_data)
full_data$predictedm2 <- predict(m2)
full_data$residualsm2 <- residuals(m2)
plot_residuals(m2, full_data)
# m3: prior score + subject + locale
m3 <- lm(ss2 ~ ss1 + subject + locale, na.action = na.exclude, data = full_data)
plot_residuals(m3, full_data)
# m4: prior score + grade + test year
m4 <- lm(ss2 ~ ss1 + grade + test_year, na.action = na.exclude, data = full_data)
plot_residuals(m4, full_data)
# m5: prior score + grade + locale
m5 <- lm(ss2 ~ ss1 + grade + locale, na.action = na.exclude, data = full_data)
plot_residuals(m5, full_data)
# m6: prior score + grade + school poverty share (free/reduced lunch)
m6 <- lm(ss2 ~ ss1 + grade + frl_per, na.action = na.exclude, data = full_data)
plot_residuals(m6, full_data)
# m7: prior score + enrollment + grade
m7 <- lm(ss2 ~ ss1 + enrollment + grade, na.action = na.exclude, data = full_data)
plot_residuals(m7, full_data)
# Wider nest: columns 1:28 also carry the school covariates (attendance,
# enrollment, suspension fields) into each group's nested data frame
by_group1 <- full_data[, 1:28] %>%
group_by(grade, subject, test_year) %>%
nest()
# NOTE(review): these warnings appear to be tibble's "unknown column"
# messages referencing 'model'; they look benign here -- verify they do not
# indicate a stale by_group object being touched.
Warning messages:
1: Unknown or uninitialised column: 'model'.
2: Unknown or uninitialised column: 'model'.
3: Unknown or uninitialised column: 'model'.
head(by_group1)
# Define a grouped dataframe using only the columns we need (1:10)
# group by the grade, subject, and year
# Then nest so the data hangs together in one list
# (this overwrites the earlier by_group, discarding its model/estimate columns)
by_group <- full_data[, 1:10] %>%
group_by(grade, subject, test_year) %>%
nest()
# Define a simple function that fits our baseline model.
#
# df: a data frame with numeric columns `ss1` (prior score) and `ss2`
#     (current score); each nested group's data frame is passed in by map().
# Returns the fitted `lm` object.
# Bug fix: the model was fit on the global `full_data` instead of the `df`
# argument, so every "per-group" model was identical.
simple_model <- function(df) {
  lm(ss2 ~ ss1, data = df)
}
# Fit the richer comparison model: prior score plus district and grade.
#
# df: a data frame with numeric columns `ss1`, `ss2`, `distid`, and `grade`.
# Returns the fitted `lm` object.
# Bug fix: was fit on the global `full_data` instead of the `df` argument,
# defeating the per-group fitting done via map() below.
alt_model <- function(df) {
  lm(ss2 ~ ss1 + distid + grade, data = df)
}
# Apply that model to each group in our dataset using the `map` function
# This is equivalent to a loop, but more efficient to write
# NOTE(review): as defined just above, simple_model and alt_model ignore
# their `df` argument and fit on the global full_data, so all 50 base_model
# fits (and all 50 full_model fits) are identical -- verify `data = df` was
# intended.
by_group <- by_group %>%
mutate(base_model = map(data, simple_model),
full_model = map(data, alt_model))
# Compare two nested models with an F-test and return a single statistic
# from the anova table.
#
# x:    the reduced fitted `lm` object.
# y:    the full fitted `lm` object (x nested within y).
# stat: which statistic to return -- "ss" for the difference in the sum of
#       squares, "p" for the p-value of the F-test. Defaults to "ss".
# ...:  unused; accepted for forward compatibility.
# Returns a length-1 numeric.
# Bug fix: the default stat = c("ss", "p") was compared with `==`, which
# yields a length-2 logical and fails inside if() (error on R >= 4.2);
# match.arg() validates the choice and collapses the default to "ss".
simple_anova <- function(x, y, stat = c("ss", "p"), ...) {
  stat <- match.arg(stat)
  out <- anova(x, y)
  if (stat == "ss") {
    out$`Sum of Sq`[[2]]
  } else {
    out$`Pr(>F)`[[2]]
  }
}
# Run the F-test per group, collecting the sum-of-squares difference and
# p-value for each grade/subject/year cell.
# NOTE(review): rowwise() + do() is a superseded dplyr pattern; map2_dbl over
# the two list-columns would be the modern equivalent.
ftests <- by_group %>% rowwise() %>%
do(ss = unlist(simple_anova(x = .$base_model, y = .$full_model, stat = "ss")),
pval = unlist(simple_anova(x = .$base_model, y = .$full_model, stat = "p")))
ftests <- apply(ftests, 2, unlist) # convert to a numeric object
ftests <- as.data.frame(ftests) # make into a data.frame for ease of use
# All 50 group-level F-tests reject at 0.05 (output below)
table(ftests$pval < 0.05)
TRUE
50
# ss2 ~ ss1 + distid + grade